# install.packages("ggplot2")
library(ggplot2)
The starting (expanded) dataset:
<dataframe> <- read.csv("<filename>",
+ stringsAsFactors = FALSE)
str(<dataframe>) # structure of <dataframe>, all variables/columns
summary(<dataframe>) # summarizing column values in the <dataframe>
# Load the Beatles songs dataset (310 songs, 15 variables) from CSV;
# stringsAsFactors = FALSE keeps text columns as plain character vectors.
the.beatles.songs <-
read.csv("The Beatles songs dataset, v3.csv", stringsAsFactors = FALSE)
str(the.beatles.songs) # inspect the structure: variable names, types, first few values
'data.frame': 310 obs. of 15 variables:
$ Title : chr "12-Bar Original" "A Day in the Life" "A Hard Day's Night" "A Shot of Rhythm and Blues" ...
$ Year : chr "1965" "1967" "1964" "1963" ...
$ Duration : int 174 335 152 104 163 230 139 NA 124 124 ...
$ Other.releases : int NA 12 35 NA 29 19 14 9 9 32 ...
$ Single.A.side : chr "" "Sgt. Pepper's Lonely Hearts Club Band / With a Little Help from My Friends" "A Hard Day's Night; A Hard Day's Night" "" ...
$ Single.B.side : chr "" "A Day in the Life" "Things We Said Today; I Should Have Known Better" "" ...
$ Single.certification: chr "" "" "RIAA Gold" "" ...
$ Genre : chr "Blues" "Psychedelic Rock, Art Rock, Pop/Rock" "Rock, Electronic, Pop/Rock" "R&B, Pop/Rock" ...
$ Songwriter : chr "Lennon, McCartney, Harrison and Starkey" "Lennon and McCartney" "Lennon" "Thompson" ...
$ Lead.vocal : chr "" "Lennon and McCartney" "Lennon, with McCartney" "Lennon" ...
$ Cover : chr "" "" "" "Y" ...
$ Covered.by : int NA 27 35 NA NA 32 NA NA 2 20 ...
$ Top.50.Billboard : int NA NA 8 NA NA NA 50 41 NA NA ...
$ Top.50.Rolling.Stone: int NA 1 11 NA NA NA NA NA NA 44 ...
$ Top.50.NME : int NA 2 19 NA NA 7 NA NA NA 35 ...
summary(the.beatles.songs)
Title Year Duration Other.releases Single.A.side Single.B.side
Length:310 Length:310 Min. : 23.0 Min. : 2.00 Length:310 Length:310
Class :character Class :character 1st Qu.:130.0 1st Qu.: 8.00 Class :character Class :character
Mode :character Mode :character Median :149.0 Median :12.50 Mode :character Mode :character
Mean :160.6 Mean :14.96
3rd Qu.:176.0 3rd Qu.:19.25
Max. :502.0 Max. :56.00
NA's :29 NA's :94
Single.certification Genre Songwriter Lead.vocal Cover
Length:310 Length:310 Length:310 Length:310 Length:310
Class :character Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character Mode :character
Covered.by Top.50.Billboard Top.50.Rolling.Stone Top.50.NME
Min. : 1.00 Min. : 1.00 Min. : 1.00 Min. : 1.00
1st Qu.: 3.00 1st Qu.:13.00 1st Qu.:13.00 1st Qu.:13.00
Median : 7.00 Median :25.00 Median :26.00 Median :26.00
Mean :11.50 Mean :25.31 Mean :25.55 Mean :25.69
3rd Qu.:15.75 3rd Qu.:38.00 3rd Qu.:38.00 3rd Qu.:38.00
Max. :70.00 Max. :50.00 Max. :50.00 Max. :50.00
NA's :128 NA's :261 NA's :261 NA's :261
A modified version of the dataset has been saved earlier using:
saveRDS(object = <dataframe or another R object>, file = "<filename>") # save R object for the next session
Restoring the dataset from the corresponding RData file:
<dataframe or another R object> <- readRDS(file = "<filename>") # restore R object in the next session
The Beatles songs dataset has been saved earlier using:
# Save the modified dataset so later sessions can skip the preprocessing steps.
saveRDS(object = the.beatles.songs, file = "The Beatles songs dataset, v3.2.RData")
# Restore it (e.g., at the start of a new session) and re-examine the summary.
the.beatles.songs <- readRDS("The Beatles songs dataset, v3.2.RData")
summary(the.beatles.songs) # note: Year, Single.certification and Cover are now factors
Title Year Duration Other.releases Single.certification Cover
Length:310 1963 :66 Min. : 23.0 Min. : 0.00 No :259 N:239
Class :character 1968 :45 1st Qu.:133.0 1st Qu.: 0.00 RIAA 2xPlatinum : 6 Y: 71
Mode :character 1969 :43 Median :150.0 Median : 9.00 RIAA 4xPlatinum : 2
1964 :41 Mean :159.6 Mean :10.42 RIAA Gold : 33
1965 :37 3rd Qu.:172.8 3rd Qu.:16.00 RIAA Gold, BPI Silver: 2
1967 :27 Max. :502.0 Max. :56.00 RIAA Platinum : 8
(Other):51
Covered.by Top.50.Billboard Top.50.Rolling.Stone Top.50.NME
Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0
1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0
Median : 2.000 Median : 0.000 Median : 0.000 Median : 0
Mean : 6.752 Mean : 4.061 Mean : 4.023 Mean : 4
3rd Qu.: 8.000 3rd Qu.: 0.000 3rd Qu.: 0.000 3rd Qu.: 0
Max. :70.000 Max. :50.000 Max. :50.000 Max. :50
Decide on the output variable and make appropriate adjustments.
Examine the Top.50.Billboard variable more closely:
the.beatles.songs$Top.50.Billboard
[1] 0 0 43 0 0 0 1 10 0 0 0 0 36 14 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 41
[36] 0 0 0 0 0 0 0 0 0 0 45 0 0 0 22 0 0 0 0 0 25 0 0 0 0 0 0 30 13 0 0 0 0 0 0
[71] 0 0 0 12 0 0 47 0 0 0 0 0 0 0 0 0 28 0 0 0 0 44 37 0 0 0 0 0 50 0 0 0 0 0 0
[106] 0 2 40 0 0 0 0 0 0 0 15 0 0 49 0 0 0 0 0 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[141] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 29 0 0 46 0 0 0 0 0 0 35 0 0 0 0 0 0 0 0
[176] 0 0 11 0 0 0 0 0 0 0 0 0 6 0 0 0 0 0 24 0 0 0 0 0 0 0 0 17 32 27 0 0 33 0 9
[211] 4 0 0 19 0 0 0 0 0 0 0 0 0 0 0 0 0 48 0 20 0 0 0 7 0 0 0 21 0 0 18 0 0 0 0
[246] 0 0 0 0 0 5 0 0 0 23 0 0 0 0 0 31 0 0 0 0 0 0 0 0 0 34 0 0 0 0 0 38 0 0 0
[281] 42 0 0 0 0 0 0 0 0 0 0 0 0 0 26 0 0 39 0 0 0 0 0 0 0 0 0 0 0 0
Output variable: a song is among Billboard’s top 50 Beatles songs or it isn’t (Yes/No, TRUE/FALSE).
Create a new variable to represent the output variable:
# Initialize the new output variable: every song starts as "No"
# (not in Billboard's top 50); the "Yes" values are filled in below.
the.beatles.songs$Top.50.BB <- "No"
the.beatles.songs$Top.50.BB # verify: all 310 values are "No"
[1] "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
[22] "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
[43] "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
[64] "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
[85] "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
[106] "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
[127] "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
[148] "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
[169] "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
[190] "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
[211] "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
[232] "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
[253] "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
[274] "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
[295] "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
# Mark the songs that appear in Billboard's top 50: a positive value in
# Top.50.Billboard (rank 1-50) means the song made the chart.
the.beatles.songs$Top.50.BB[the.beatles.songs$Top.50.Billboard > 0] <- "Yes"
the.beatles.songs$Top.50.BB # verify: "Yes" exactly where Top.50.Billboard > 0
[1] "No" "No" "Yes" "No" "No" "No" "Yes" "Yes" "No" "No" "No" "No" "Yes" "Yes" "No" "No" "No"
[18] "No" "No" "No" "No" "Yes" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
[35] "Yes" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "Yes" "No" "No" "No" "Yes" "No"
[52] "No" "No" "No" "No" "Yes" "No" "No" "No" "No" "No" "No" "Yes" "Yes" "No" "No" "No" "No"
[69] "No" "No" "No" "No" "No" "Yes" "No" "No" "Yes" "No" "No" "No" "No" "No" "No" "No" "No"
[86] "No" "Yes" "No" "No" "No" "No" "Yes" "Yes" "No" "No" "No" "No" "No" "Yes" "No" "No" "No"
[103] "No" "No" "No" "No" "Yes" "Yes" "No" "No" "No" "No" "No" "No" "No" "Yes" "No" "No" "Yes"
[120] "No" "No" "No" "No" "No" "Yes" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
[137] "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
[154] "No" "No" "No" "Yes" "No" "No" "Yes" "No" "No" "No" "No" "No" "No" "Yes" "No" "No" "No"
[171] "No" "No" "No" "No" "No" "No" "No" "Yes" "No" "No" "No" "No" "No" "No" "No" "No" "No"
[188] "Yes" "No" "No" "No" "No" "No" "Yes" "No" "No" "No" "No" "No" "No" "No" "No" "Yes" "Yes"
[205] "Yes" "No" "No" "Yes" "No" "Yes" "Yes" "No" "No" "Yes" "No" "No" "No" "No" "No" "No" "No"
[222] "No" "No" "No" "No" "No" "No" "Yes" "No" "Yes" "No" "No" "No" "Yes" "No" "No" "No" "Yes"
[239] "No" "No" "Yes" "No" "No" "No" "No" "No" "No" "No" "No" "No" "Yes" "No" "No" "No" "Yes"
[256] "No" "No" "No" "No" "No" "Yes" "No" "No" "No" "No" "No" "No" "No" "No" "No" "Yes" "No"
[273] "No" "No" "No" "No" "Yes" "No" "No" "No" "Yes" "No" "No" "No" "No" "No" "No" "No" "No"
[290] "No" "No" "No" "No" "No" "Yes" "No" "No" "Yes" "No" "No" "No" "No" "No" "No" "No" "No"
[307] "No" "No" "No" "No"
Turn the new variable into a factor:
# Classification algorithms expect a factor output variable, so convert
# the "No"/"Yes" character vector into a two-level factor.
the.beatles.songs$Top.50.BB <- as.factor(the.beatles.songs$Top.50.BB)
head(the.beatles.songs$Top.50.BB) # levels: No, Yes
[1] No No Yes No No No
Levels: No Yes
Save this version of the dataset as well, for future reuse:
saveRDS(object = <dataframe or another R object>, file = "<filename>") # save R object for the next session
saveRDS(object = the.beatles.songs, file = "The Beatles songs dataset, v3.3.RData")
Examine the distribution of the two factor values more precisely:
table(<dataset>$<output variable>)
prop.table(table(<dataset>$<output variable>))
round(prop.table(table(<dataset>$<output variable>)), digits = 2)
table(the.beatles.songs$Top.50.BB)
No Yes
261 49
prop.table(table(the.beatles.songs$Top.50.BB))
No Yes
0.8419355 0.1580645
round(prop.table(table(the.beatles.songs$Top.50.BB)), digits = 2)
No Yes
0.84 0.16
Split the dataset into train and test sets:
# install.packages("caret")
library(caret)
set.seed(<n>)
<train dataset indices> <- # stratified partitioning:
+ createDataPartition(<dataset>$<output variable>, # the same distribution of the output variable in both sets
+ p = .80, # 80/20% of data in train/test sets
+ list = FALSE) # don't make a list of results, make a matrix
<train dataset> <- <dataset>[<train dataset indices>, ]
<test dataset> <- <dataset>[-<train dataset indices>, ]
library(caret)
# Fix the random seed so the train/test split is reproducible.
set.seed(444)
# set.seed(333) - results in a different split, and different tree and evaluation metrics
# Stratified 80/20 split: both sets keep the ~84/16 No/Yes class distribution.
train.data.indices <- createDataPartition(the.beatles.songs$Top.50.BB, p = 0.80, list = FALSE)
train.data <- the.beatles.songs[train.data.indices, ] # 80% of rows, for model building
test.data <- the.beatles.songs[-train.data.indices, ] # remaining 20%, for evaluation
Build the model / decision tree:
# install.packages("rpart") # one-time installation only; commented out to match
#                           # every other install.packages() call in this script
library(rpart)
<decision tree> <- rpart(<output variable> ~ # build the tree
+ <predictor variable 1> + <predictor variable 2> + ..., # . to include all variables
+ data = <train dataset>,
+ method = "class") # build classification tree
print(<decision tree>) # default textual representation
library(rpart)
# Build a classification tree predicting Top.50.BB from three predictors.
top.50.tree.1 <- rpart(Top.50.BB ~ Single.certification + Covered.by + Year,
                       data = train.data,
                       method = "class") # "class" = classification (not regression) tree
print(top.50.tree.1) # default textual representation of the splits
n= 249
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 249 40 No (0.83935743 0.16064257)
2) Single.certification=No 208 8 No (0.96153846 0.03846154) *
3) Single.certification=RIAA 2xPlatinum,RIAA 4xPlatinum,RIAA Gold,RIAA Gold, BPI Silver,RIAA Platinum 41 9 Yes (0.21951220 0.78048780)
6) Covered.by< 5.5 10 4 No (0.60000000 0.40000000) *
7) Covered.by>=5.5 31 3 Yes (0.09677419 0.90322581) *
Depict the model:
# install.packages("rattle")
# install.packages("rpart.plot")
# install.packages("RColorBrewer")
library(rattle)
library(rpart.plot)
library(RColorBrewer)
fancyRpartPlot(<decision tree>)
library(rpart)
library(rattle)       # provides fancyRpartPlot()
library(rpart.plot)   # plotting backend used by fancyRpartPlot()
library(RColorBrewer) # color palettes used by fancyRpartPlot()
fancyRpartPlot(top.50.tree.1) # graphical depiction of the tree
Make predictions using the model built in the previous step:
<predictions> <- predict(object = <decision tree>,
+ newdata = <test dataset>,
+ type = "class")
<predictions>[<i1>:<ik>] # examine some of the predictions
<predictions dataframe> <-
+ data.frame(<observation ID> = <test dataset>$<observation ID column>,
+ <another relevant feature> = <test dataset>$<another relevant feature column>,
+ ...,
+ <output feature> = <test dataset>$<output variable>,
+ <predictions feature> = <predictions>)
# Apply the tree to the held-out test set; type = "class" returns predicted
# class labels (No/Yes) rather than class probabilities.
top.50.predictions.1 <- predict(top.50.tree.1, newdata = test.data, type = "class")
top.50.predictions.1[1:20] # spot-check the first 20 predictions
1 2 3 5 11 15 19 22 25 27 42 60 62 71 72 78 85 99 102 107
No No Yes No No No No No No No No Yes No No No No No Yes No No
Levels: No Yes
# Collect song titles, the true Billboard data and the predictions side by
# side, so individual hits and misses are easy to inspect.
top.50.predictions.1.dataframe <- data.frame(Song = test.data$Title,
                                             Top.50.Billboard = test.data$Top.50.Billboard,
                                             Top.50.BB = test.data$Top.50.BB,
                                             Prediction = top.50.predictions.1)
How good are the predictions?
Compute confusion matrix:
<cm> <- table(True = <predictions dataframe>$<output variable>,
+ Predicted = <predictions dataframe>$<predictions>)
Compute evaluation metrics:
Note: precision and recall are inversely proportional to each other.
<evaluation metrics vector> <- <user-specified function>(<cm>)
# accuracy = sum(diag(cm)) / sum(cm)
# precision <- TP / (TP + FP)
# recall <- TP / (TP + FN)
# F1 <- (2 * precision * recall) / (precision + recall)
# Confusion matrix: rows = true classes, columns = predicted classes.
cm.1 <- table(True = test.data$Top.50.BB, Predicted = top.50.predictions.1)
cm.1
Predicted
True No Yes
No 51 1
Yes 5 4
# alternatively:
# cm.1 <- table(True = top.50.predictions.1.dataframe$Top.50.BB,
# Predicted = top.50.predictions.1.dataframe$Prediction)
# cm.1
# getEvaluationMetrics() (user-defined, in "Evaluation metrics.R") computes
# accuracy, precision, recall and F1 from a 2x2 confusion matrix.
source("Evaluation metrics.R")
eval.1 <- getEvaluationMetrics(cm.1)
eval.1
Accuracy Precision Recall F1
0.9016393 0.8000000 0.4444444 0.5714286
Try another tree, using fewer and/or different predictors (e.g., Duration + Covered.by, Duration + Year, etc.). In practice, strong predictors from the previous tree are kept in the new tree as well.
Also try changing the seed used when splitting the dataset into train and test sets.
# Second tree: different predictors (no Single.certification), for comparison.
top.50.tree.2 <- rpart(Top.50.BB ~ Duration + Covered.by,
                       data = train.data,
                       method = "class")
# top.50.tree.2 <- rpart(Top.50.BB ~ ., # use almost all variables, excluding some specific ones
#                        data = subset(train.data, select = -c(Title, Top.50.Billboard)),
#                        method = "class")
print(top.50.tree.2)
n= 249
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 249 40 No (0.83935743 0.16064257)
2) Covered.by< 5.5 168 8 No (0.95238095 0.04761905) *
3) Covered.by>=5.5 81 32 No (0.60493827 0.39506173)
6) Covered.by< 28 64 20 No (0.68750000 0.31250000)
12) Duration>=207.5 13 2 No (0.84615385 0.15384615) *
13) Duration< 207.5 51 18 No (0.64705882 0.35294118)
26) Duration< 182.5 42 12 No (0.71428571 0.28571429)
52) Duration>=150 19 3 No (0.84210526 0.15789474) *
53) Duration< 150 23 9 No (0.60869565 0.39130435)
106) Duration< 129 11 2 No (0.81818182 0.18181818) *
107) Duration>=129 12 5 Yes (0.41666667 0.58333333) *
27) Duration>=182.5 9 3 Yes (0.33333333 0.66666667) *
7) Covered.by>=28 17 5 Yes (0.29411765 0.70588235) *
fancyRpartPlot(top.50.tree.2) # visualize the second tree
# Predict on the test set with the second tree.
top.50.predictions.2 <- predict(top.50.tree.2, newdata = test.data, type = "class")
top.50.predictions.2[1:20] # spot-check the first 20 predictions
1 2 3 5 11 15 19 22 25 27 42 60 62 71 72 78 85 99 102 107
No No Yes No No No No No No No No No Yes No No No Yes Yes No No
Levels: No Yes
# Side-by-side view of titles, true values and tree-2 predictions.
top.50.predictions.2.dataframe <- data.frame(Song = test.data$Title,
                                             Top.50.Billboard = test.data$Top.50.Billboard,
                                             Top.50.BB = test.data$Top.50.BB,
                                             Prediction = top.50.predictions.2)
# Confusion matrix for tree 2.
cm.2 <- table(True = test.data$Top.50.BB, Predicted = top.50.predictions.2)
cm.2
Predicted
True No Yes
No 42 10
Yes 6 3
eval.2 <- getEvaluationMetrics(cm.2) # accuracy/precision/recall/F1 for tree 2
eval.2
Accuracy Precision Recall F1
0.7377049 0.2307692 0.3333333 0.2727273
Controlling rpart parameters:
# install.packages("rpart")
library(rpart)
<decision tree> <- rpart(<output variable> ~ # build the tree
+ <predictor variable 1> + <predictor variable 2> + ..., # . to include all variables
+ data = <train dataset>,
+ method = "class", # build classification tree
+ control = rpart.control(minsplit = <n>, cp = <q>)) # decrease both for larger tree
print(<decision tree>) # default textual representation
# Third tree: same predictors as tree 1, but with looser stopping criteria
# (smaller minsplit and cp), which grows a larger, more detailed tree.
top.50.tree.3 <- rpart(Top.50.BB ~ Single.certification + Covered.by + Year,
                       data = train.data,
                       method = "class",
                       control = rpart.control(minsplit = 10, cp = 0.001))
print(top.50.tree.3)
n= 249
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 249 40 No (0.83935743 0.16064257)
2) Single.certification=No 208 8 No (0.96153846 0.03846154)
4) Year=1958,1960,1962,1963,1964,1965,1966,1967,1968,1969,1970 205 6 No (0.97073171 0.02926829)
8) Covered.by< 17.5 189 2 No (0.98941799 0.01058201) *
9) Covered.by>=17.5 16 4 No (0.75000000 0.25000000)
18) Year=1965,1966,1967,1968,1969 12 1 No (0.91666667 0.08333333) *
19) Year=1963,1964 4 1 Yes (0.25000000 0.75000000) *
5) Year=1961 3 1 Yes (0.33333333 0.66666667) *
3) Single.certification=RIAA 2xPlatinum,RIAA 4xPlatinum,RIAA Gold,RIAA Gold, BPI Silver,RIAA Platinum 41 9 Yes (0.21951220 0.78048780)
6) Covered.by< 5.5 10 4 No (0.60000000 0.40000000)
12) Year=1964,1968,1969 5 0 No (1.00000000 0.00000000) *
13) Year=1962,1963,1965,1980 5 1 Yes (0.20000000 0.80000000) *
7) Covered.by>=5.5 31 3 Yes (0.09677419 0.90322581) *
fancyRpartPlot(top.50.tree.3) # visualize the larger tree
# Predict on the test set with the larger tree.
top.50.predictions.3 <- predict(top.50.tree.3, newdata = test.data, type = "class")
top.50.predictions.3[1:20] # spot-check the first 20 predictions
1 2 3 5 11 15 19 22 25 27 42 60 62 71 72 78 85 99 102 107
No No Yes No No No No No No No Yes Yes No No No No No Yes No No
Levels: No Yes
# Side-by-side view of titles, true values and tree-3 predictions.
top.50.predictions.3.dataframe <- data.frame(Song = test.data$Title,
                                             Top.50.Billboard = test.data$Top.50.Billboard,
                                             Top.50.BB = test.data$Top.50.BB,
                                             Prediction = top.50.predictions.3)
# Confusion matrix for tree 3.
cm.3 <- table(True = test.data$Top.50.BB, Predicted = top.50.predictions.3)
cm.3
Predicted
True No Yes
No 50 2
Yes 5 4
eval.3 <- getEvaluationMetrics(cm.3) # accuracy/precision/recall/F1 for tree 3
eval.3
Accuracy Precision Recall F1
0.8852459 0.6666667 0.4444444 0.5333333
Compare the results (the corresponding models/trees):
data.frame(rbind(<evaluation metrics 1>, <evaluation metrics 2>),
+ row.names = c("<tree 1>", "<tree 2>"))
# Compare evaluation metrics of trees 1 and 3 in a single table.
data.frame(rbind(eval.1, eval.3),
           row.names = c("top.50.tree.1", "top.50.tree.3"))
Model 3 exhibits overfitting. It is a frequent case with large trees.
Cross-validate the model - find the optimal value for cp (the most important parameter), in order to avoid overfitting the model to the training data:
# install.packages("e1071") # relevant caret functions need e1071
# install.packages("caret")
library(e1071)
library(caret)
<folds> = trainControl(method = "cv", number = <k>) # define <k>-fold cross-validation parameters
<cpGrid> = expand.grid(.cp = # specify the range of the cp values to examine
+ seq(from = <start value>, to = <end value>, by = <step>))
set.seed(<seed>)
train(<output variable> ~ # find the optimal value for cp
+ <predictor variable 1> + <predictor variable 2> + ..., # . to include all variables
+ data = <train dataset>,
+ method = "rpart", # use rpart() to build multiple classification trees
+ control = rpart.control(minsplit = 10),
+ trControl = <folds>, tuneGrid = <cpGrid>) # <folds> and <cpGrid> from above
library(e1071)
library(caret)
# 10-fold cross-validation over a grid of cp values, to find the cp that
# maximizes cross-validated accuracy (guards against overfitting).
folds = trainControl(method = "cv", number = 10) # 10-fold cross-validation
cpGrid = expand.grid(.cp = seq(from = 0.001, to = 0.05, by = 0.001)) # cp candidates
set.seed(11) # make the fold assignment reproducible
train(Top.50.BB ~ Single.certification + Covered.by + Year,
      data = train.data,
      method = "rpart", # build an rpart tree for each candidate cp
      control = rpart.control(minsplit = 10),
      trControl = folds, tuneGrid = cpGrid)
CART
249 samples
3 predictor
2 classes: 'No', 'Yes'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 224, 224, 225, 224, 224, 224, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.001 0.9113333 0.6777534
0.002 0.9113333 0.6777534
0.003 0.9113333 0.6777534
0.004 0.9113333 0.6777534
0.005 0.9113333 0.6777534
0.006 0.9113333 0.6777534
0.007 0.9113333 0.6777534
0.008 0.9113333 0.6777534
0.009 0.9113333 0.6777534
0.010 0.9113333 0.6777534
0.011 0.9113333 0.6777534
0.012 0.9113333 0.6777534
0.013 0.9113333 0.6777534
0.014 0.9028333 0.6741848
0.015 0.9028333 0.6741848
0.016 0.9028333 0.6741848
0.017 0.9028333 0.6741848
0.018 0.9028333 0.6741848
0.019 0.9028333 0.6741848
0.020 0.9028333 0.6741848
0.021 0.9028333 0.6741848
0.022 0.9028333 0.6741848
0.023 0.9028333 0.6741848
0.024 0.9028333 0.6741848
0.025 0.9028333 0.6741848
0.026 0.9028333 0.6741848
0.027 0.9028333 0.6741848
0.028 0.8828333 0.5697261
0.029 0.8828333 0.5697261
0.030 0.8828333 0.5697261
0.031 0.8828333 0.5697261
0.032 0.8828333 0.5697261
0.033 0.8828333 0.5697261
0.034 0.8828333 0.5697261
0.035 0.8828333 0.5697261
0.036 0.8828333 0.5697261
0.037 0.8828333 0.5697261
0.038 0.8828333 0.5697261
0.039 0.8828333 0.5697261
0.040 0.8828333 0.5697261
0.041 0.8828333 0.5697261
0.042 0.8748333 0.5248611
0.043 0.8748333 0.5248611
0.044 0.8748333 0.5248611
0.045 0.8748333 0.5248611
0.046 0.8748333 0.5248611
0.047 0.8748333 0.5248611
0.048 0.8748333 0.5248611
0.049 0.8748333 0.5248611
0.050 0.8748333 0.5248611
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.013.
Pruning a complex tree using the optimal cp value: <pruned decision tree> <- prune(<decision tree>, cp = <optimal cp value>)
print(<pruned decision tree>)
fancyRpartPlot(<pruned decision tree>)
# Prune tree 3 back using the cross-validated optimal cp value.
top.50.tree.5 <- prune(top.50.tree.3, cp = 0.013) # cp value found in the previous step (train())
print(top.50.tree.5)
n= 249
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 249 40 No (0.83935743 0.16064257)
2) Single.certification=No 208 8 No (0.96153846 0.03846154)
4) Year=1958,1960,1962,1963,1964,1965,1966,1967,1968,1969,1970 205 6 No (0.97073171 0.02926829)
8) Covered.by< 17.5 189 2 No (0.98941799 0.01058201) *
9) Covered.by>=17.5 16 4 No (0.75000000 0.25000000)
18) Year=1965,1966,1967,1968,1969 12 1 No (0.91666667 0.08333333) *
19) Year=1963,1964 4 1 Yes (0.25000000 0.75000000) *
5) Year=1961 3 1 Yes (0.33333333 0.66666667) *
3) Single.certification=RIAA 2xPlatinum,RIAA 4xPlatinum,RIAA Gold,RIAA Gold, BPI Silver,RIAA Platinum 41 9 Yes (0.21951220 0.78048780)
6) Covered.by< 5.5 10 4 No (0.60000000 0.40000000)
12) Year=1964,1968,1969 5 0 No (1.00000000 0.00000000) *
13) Year=1962,1963,1965,1980 5 1 Yes (0.20000000 0.80000000) *
7) Covered.by>=5.5 31 3 Yes (0.09677419 0.90322581) *
fancyRpartPlot(top.50.tree.5)
Make predictions with the pruned tree:
# Predict on the test set with the pruned tree.
top.50.predictions.5 <- predict(top.50.tree.5, newdata = test.data, type = "class")
top.50.predictions.5[1:20] # spot-check the first 20 predictions
1 2 3 5 11 15 19 22 25 27 42 60 62 71 72 78 85 99 102 107
No No Yes No No No No No No No Yes Yes No No No No No Yes No No
Levels: No Yes
# Side-by-side view of titles, true values and pruned-tree predictions.
top.50.predictions.5.dataframe <- data.frame(Song = test.data$Title,
                                             Top.50.Billboard = test.data$Top.50.Billboard,
                                             Top.50.BB = test.data$Top.50.BB,
                                             Prediction = top.50.predictions.5)
# Confusion matrix for the pruned tree.
cm.5 <- table(True = test.data$Top.50.BB, Predicted = top.50.predictions.5)
cm.5
Predicted
True No Yes
No 50 2
Yes 5 4
eval.5 <- getEvaluationMetrics(cm.5) # accuracy/precision/recall/F1 for the pruned tree
eval.5
Accuracy Precision Recall F1
0.8852459 0.6666667 0.4444444 0.5333333
Compare all relevant models: data.frame(rbind(<evaluation metrics 1>, <evaluation metrics 2>, ...),
+ row.names = c("<tree 1>", "<tree 2>", ...))
# Compare all three relevant trees in one table.
data.frame(rbind(eval.1, eval.3, eval.5),
           row.names = c("top.50.tree.1", "top.50.tree.3", "top.50.tree.5"))
Restoring the dataset from the corresponding RData file:
<dataframe or another R object> <- readRDS(file = "<filename>") # restore R object in the next session
The Beatles songs dataset has been saved earlier using:
saveRDS(object = the.beatles.songs, file = "The Beatles songs dataset, v3.4.RData")
the.beatles.songs <- readRDS("The Beatles songs dataset, v3.4.RData")
Eliminate Top.50.Billboard from the dataset. Top.50.BB has been created from Top.50.Billboard, hence Top.50.Billboard predicts Top.50.BB 100%. Still, store Top.50.Billboard in a separate vector for possible later use.
# Keep a copy of the raw Billboard ranks, then drop the column: Top.50.BB was
# derived from Top.50.Billboard, so leaving it in would let a model predict
# the output perfectly (data leakage).
top.50.billboard <- the.beatles.songs$Top.50.Billboard
the.beatles.songs$Top.50.Billboard <- NULL # assigning NULL removes the column
Save this version of the dataset as well, for future reuse:
saveRDS(object = the.beatles.songs, file = "The Beatles songs dataset, v3.5.RData")
Is rescaling of numeric variables needed? Are their ranges different?
summary(<dataframe>) # examine the ranges of numeric variables
summary(the.beatles.songs)
Title Year Duration Other.releases Single.certification Cover
Length:310 1963 :66 Min. : 23.0 Min. : 0.00 No :259 N:239
Class :character 1968 :45 1st Qu.:133.0 1st Qu.: 0.00 RIAA 2xPlatinum : 6 Y: 71
Mode :character 1969 :43 Median :150.0 Median : 9.00 RIAA 4xPlatinum : 2
1964 :41 Mean :159.6 Mean :10.42 RIAA Gold : 33
1965 :37 3rd Qu.:172.8 3rd Qu.:16.00 RIAA Gold, BPI Silver: 2
1967 :27 Max. :502.0 Max. :56.00 RIAA Platinum : 8
(Other):51
Covered.by Top.50.Rolling.Stone Top.50.NME Top.50.BB
Min. : 0.000 Min. : 0.000 Min. : 0 No :261
1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0 Yes: 49
Median : 2.000 Median : 0.000 Median : 0
Mean : 6.752 Mean : 4.023 Mean : 4
3rd Qu.: 8.000 3rd Qu.: 0.000 3rd Qu.: 0
Max. :70.000 Max. :50.000 Max. :50
Check if numeric variables follow normal distribution:
summary(<numeric variable>) # the mean and the median values similar: probably normal distribution
plot(density((<numeric variable>)) # visual inspection
hist(<numeric variable>) # visual inspection
qqnorm(<numeric variable>) # values lie more or less along the diagonal (straight line)
shapiro.test(<numeric variable>) # good for small sample sizes, e.g. n < ~2000; H0: normal distribution
plot(density(the.beatles.songs$Covered.by))
# ... # check the distributions of other variables as well
Since the distribution of numeric variables shows that rescaling is needed:
# rescaleNumericVariables() (user-defined, in "Rescale numeric variables.R")
# brings numeric columns to comparable ranges -- needed for kNN, which is
# distance-based and otherwise dominated by large-range variables.
source("Rescale numeric variables.R")
the.beatles.songs.rescaled <- rescaleNumericVariables(the.beatles.songs)
Split the dataset into train and test sets:
# install.packages("caret")
library(caret)
set.seed(<n>)
<train dataset indices> <- # stratified partitioning:
+ createDataPartition(<dataset>$<output variable>, # the same distribution of the output variable in both sets
+ p = .80, # 80/20% of data in train/test sets
+ list = FALSE) # don't make a list of results, make a matrix
<train dataset> <- <dataset>[<train dataset indices>, ]
<test dataset> <- <dataset>[-<train dataset indices>, ]
library(caret)
set.seed(444) # same seed as before, for a comparable split
# set.seed(333) - results in a different split, and different results and eval. metrics
# Stratified 80/20 split of the rescaled data.
train.data.indices <- createDataPartition(the.beatles.songs.rescaled$Top.50.BB, p = 0.80, list = FALSE)
train.data <- the.beatles.songs.rescaled[train.data.indices, ] # 80% for model building
test.data <- the.beatles.songs.rescaled[-train.data.indices, ] # 20% for evaluation
Build the model:
library(class)
<knn model> <- knn(train = <training dataset>, # training data without the output (class) variable
+ test = <test dataset>, # test data without the output (class) variable
+ cl = <class values for training>, # output (class) variable is specified here
+ k = <n>) # <n>: random guess, or obtained from cross-validation
head(<knn model>)
library(class)
# kNN predictors: everything except Title (non-numeric identifier) and the
# output/class variable itself. Select the columns by NAME rather than by
# position (the original used -c(1, 10)), so the code stays correct even if
# the column order of the data frame changes.
knn.predictors <- setdiff(names(train.data), c("Title", "Top.50.BB"))
top.50.knn.1 <- knn(train = train.data[, knn.predictors],
                    test = test.data[, knn.predictors],
                    cl = train.data$Top.50.BB, # output (class) variable
                    k = 5)                     # k = 5: random value to start with
head(top.50.knn.1) # these are already the predictions, i.e. no predict() etc.
[1] No No Yes No No No
Levels: No Yes
which(test.data$Top.50.BB != top.50.knn.1)
[1] 11 29 41 48
Compute confusion matrix:
<cm> <- table(True = <test dataset>$<output variable>,
+ Predicted = <test dataset>$<predictions>)
# Confusion matrix for the k = 5 kNN model.
knn.cm.1 <- table(True = test.data$Top.50.BB, Predicted = top.50.knn.1)
knn.cm.1
Predicted
True No Yes
No 51 1
Yes 3 6
Compute evaluation metrics:
Note: precision and recall are inversely proportional to each other.
<evaluation metrics vector> <- <user-specified function>(<cm>)
# accuracy = sum(diag(cm)) / sum(cm)
# precision <- TP / (TP + FP)
# recall <- TP / (TP + FN)
# F1 <- (2 * precision * recall) / (precision + recall)
source("Evaluation metrics.R") # defines getEvaluationMetrics()
eval.knn.1 <- getEvaluationMetrics(knn.cm.1)
eval.knn.1
Accuracy Precision Recall F1
0.9344262 0.8571429 0.6666667 0.7500000
What if k had a different value?
Cross-validate the model - find the optimal value for k (the most important parameter), in order to avoid overfitting the model to the training data:
# install.packages("e1071") # relevant caret functions need e1071
# install.packages("caret")
library(e1071)
library(caret)
<folds> = trainControl(method = "cv", number = <k>) # define <k>-fold cross-validation parameters
<cpGrid> = expand.grid(.k = # specify the range of the (odd) values to examine
seq(from = <start value>, to = <end value>, by = <step>))
set.seed(<seed>)
<knn cv> <- train(<output variable> ~ # find the optimal value for k
+ <predictor variable 1> + <predictor variable 2> + ..., # . to include all variables
+ data = <train dataset>,
+ method = "knn", # use knn() to build multiple classification models
+ trControl = <folds>, tuneGrid = <cpGrid>) # <folds> and <cpGrid> from above
library(e1071)
library(caret)
# 10-fold cross-validation over odd k values 3..25 (odd k avoids voting ties
# in a two-class problem), to find the k with the best CV accuracy.
knn.folds = trainControl(method = "cv", number = 10) # 10-fold cross-validation
knn.cpGrid = expand.grid(.k = seq(from = 3, to = 25, by = 2))
set.seed(11) # reproducible fold assignment
knn.cv <- train(Top.50.BB ~ . - Title, # all predictors except Title
                data = train.data,
                method = "knn",
                trControl = knn.folds, tuneGrid = knn.cpGrid)
knn.cv
k-Nearest Neighbors
249 samples
9 predictor
2 classes: 'No', 'Yes'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 224, 224, 225, 224, 224, 224, ...
Resampling results across tuning parameters:
k Accuracy Kappa
3 0.9033333 0.5982314
5 0.9073333 0.6323177
7 0.9033333 0.6221013
9 0.9073333 0.6314675
11 0.9073333 0.6323177
13 0.9073333 0.6323177
15 0.9073333 0.6323177
17 0.9073333 0.6323177
19 0.9033333 0.6124107
21 0.9073333 0.6259243
23 0.9073333 0.6259243
25 0.9073333 0.6259243
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was k = 25.
Plot the cross-validation results (accuracy for different values of k):
# plot(<knn model>) # the model obtained by <knn model> <- train(...)
plot(knn.cv)
Build the model and compute confusion matrix and evaluation metrics for another value of k:
# kNN with k = 7 (another value to test and compare), plus its confusion matrix.
# Exclude Title (non-numeric identifier) and the output variable by NAME rather
# than by position (the original used -c(1, 10)), so the code stays correct
# even if the column order of the data frame changes.
knn.predictors.2 <- setdiff(names(train.data), c("Title", "Top.50.BB"))
top.50.knn.2 <- knn(train = train.data[, knn.predictors.2],
                    test = test.data[, knn.predictors.2],
                    cl = train.data$Top.50.BB, # output (class) variable
                    k = 7)
knn.cm.2 <- table(True = test.data$Top.50.BB, Predicted = top.50.knn.2) # confusion matrix
knn.cm.2
Predicted
True No Yes
No 51 1
Yes 2 7
eval.knn.2 <- getEvaluationMetrics(knn.cm.2) # metrics for the k = 7 model
eval.knn.2
Accuracy Precision Recall F1
0.9508197 0.8750000 0.7777778 0.8235294
Compare the evaluation metrics of the two classifiers:
data.frame(rbind(<evaluation metrics 1>, <evaluation metrics 2>, ...),
+ row.names = c("<model 1>", "<model 2>", ...))
# Compare the two kNN models in one table.
data.frame(rbind(eval.knn.1, eval.knn.2),
           row.names = c("eval.knn.1", "eval.knn.2"))
Read an appropriate version of the dataset:
<dataframe or another R object> <- readRDS(file = "<filename>") # restore R object from another session
The Beatles songs dataset, v3.5.RData (without duplicated song names), has been saved in the session on KNN, assuming that the.beatles.songs$Top.50.BB (a factor variable) is the output variable:
saveRDS(object = the.beatles.songs, file = "The Beatles songs dataset, v3.5.RData")
The same output variable is assumed in this session.
# Reload the v3.5 dataset (Top.50.Billboard removed; Top.50.BB is the output).
the.beatles.songs <- readRDS("The Beatles songs dataset, v3.5.RData")
str(the.beatles.songs) # 310 obs. of 10 variables: factors + numerics
'data.frame': 310 obs. of 10 variables:
$ Title : chr "12-Bar Original" "A Day in the Life" "A Hard Day's Night" "A Shot of Rhythm and Blues" ...
$ Year : Factor w/ 14 levels "1958","1960",..: 7 9 6 5 5 10 7 3 5 5 ...
$ Duration : num 174 335 152 104 163 230 139 150 124 124 ...
$ Other.releases : num 0 12 35 0 29 19 14 9 9 32 ...
$ Single.certification: Factor w/ 6 levels "No","RIAA 2xPlatinum",..: 1 1 4 1 1 1 4 1 1 1 ...
$ Cover : Factor w/ 2 levels "N","Y": 1 1 1 2 2 1 2 2 1 1 ...
$ Covered.by : num 0 27 35 0 0 32 0 0 2 20 ...
$ Top.50.Rolling.Stone: num 0 50 40 0 0 0 0 0 0 7 ...
$ Top.50.NME : num 0 49 32 0 0 44 0 0 0 16 ...
$ Top.50.BB : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 1 2 2 1 1 ...
Naive Bayes classification is typically used with categorical (factor) variables.
Numeric variables (if any) should be either normally distributed (so that Gaussian densities can be used) or discretized first:
Check numeric variables in the dataset for normality assumption:
apply(<numeric dataframe>, # <original dataframe>[, c(<num. col. 1>, <num. col. 2>, ...)]
+ MARGIN = 2, # apply FUN by columns
+ FUN = shapiro.test) # Shapiro-Wilks' test of normality; good for small no. of observations (< 2000)
apply(the.beatles.songs[, c(3:4, 7:9)], MARGIN = 2, FUN = shapiro.test)
$Duration
Shapiro-Wilk normality test
data: newX[, i]
W = 0.79255, p-value < 2.2e-16
$Other.releases
Shapiro-Wilk normality test
data: newX[, i]
W = 0.88075, p-value = 8.343e-15
$Covered.by
Shapiro-Wilk normality test
data: newX[, i]
W = 0.66532, p-value < 2.2e-16
$Top.50.Rolling.Stone
Shapiro-Wilk normality test
data: newX[, i]
W = 0.41964, p-value < 2.2e-16
$Top.50.NME
Shapiro-Wilk normality test
data: newX[, i]
W = 0.4198, p-value < 2.2e-16
No normally distributed numeric variables in the dataset.
Discretize numeric variables using bnlearn::discretize():
library(bnlearn)
?discretize()
<new dataframe with discretized variables> <-
+ discretize(<numeric dataframe>, # <original dataframe>[, c(<num. col. 1>, <num. col. 2>, ...)]
+ method = "quantile" | # use equal-frequency intervals (default)
+ method = "interval" | # use equal-length intervals
+ method = "hartemink", # use Hartemink's algorithm
+ breaks = c(<n1>, <n2>, ..., <ncol>)) # no. of discrete intervals for each column
library(bnlearn)
# Discretize the five non-normal numeric columns so Naive Bayes can treat
# them as categorical features.
discretized.features <- discretize(the.beatles.songs[, c(3:4, 7:9)],
method = "interval", # equal-length intervals
breaks = c(5, 5, 5, 5, 5)) # 5 bins for each of the 5 columns
summary(discretized.features)
Duration Other.releases Covered.by Top.50.Rolling.Stone Top.50.NME
[22.5,119]: 38 [-0.056,11.2]:193 [-0.07,14]:261 [-0.05,10]:271 [-0.05,10]:271
(119,215] :238 (11.2,22.4] : 78 (14,28] : 27 (10,20] : 10 (10,20] : 10
(215,310] : 28 (22.4,33.6] : 27 (28,42] : 15 (20,30] : 9 (20,30] : 10
(310,406] : 3 (33.6,44.8] : 10 (42,56] : 6 (30,40] : 10 (30,40] : 9
(406,502] : 3 (44.8,56.1] : 2 (56,70.1] : 1 (40,50] : 10 (40,50] : 10
Re-compose the dataframe with the discretized features:
<dataframe> <- cbind(<dataframe 1>[, c(<col11>, <col12>, ...)], <dataframe 2>[, c(<col21>, <col22>, ...)])
<dataframe> <- <dataframe>[, c(<col i>, <col k>, ...)] # rearrange columns (optional)
Alternatively:
<dataframe> <- <dataframe>[, names(<original dataframe>)] # rearrange columns (optional)
# Glue the untouched columns (Title, Year, Single.certification, Cover, Top.50.BB)
# back onto the discretized features, then restore the original column order.
the.beatles.songs.nb <- cbind(the.beatles.songs[, c(1, 2, 5, 6, 10)], discretized.features)
the.beatles.songs.nb <- the.beatles.songs.nb[, names(the.beatles.songs)]
Split the dataset into train and test sets:
# install.packages("caret")
library(caret)
set.seed(<n>)
<train dataset indices> <- # stratified partitioning:
+ createDataPartition(<dataset>$<output variable>, # the same distribution of the output variable in both sets
+ p = .80, # 80/20% of data in train/test sets
+ list = FALSE) # don't make a list of results, make a matrix
<train dataset> <- <dataset>[<train dataset indices>, ]
<test dataset> <- <dataset>[-<train dataset indices>, ]
library(caret)
# Fix the RNG seed so the stratified 80/20 split is reproducible.
set.seed(4455)
# set.seed(333) - results in a different split, and different results and eval. metrics
# createDataPartition() keeps the Top.50.BB class distribution the same in both sets.
train.data.indices <- createDataPartition(the.beatles.songs.nb$Top.50.BB, p = 0.80, list = FALSE)
train.data <- the.beatles.songs.nb[train.data.indices, ]
test.data <- the.beatles.songs.nb[-train.data.indices, ]
Build the model:
library(e1071)
?naiveBayes
<model> <- naiveBayes(<output variable> ~ ., # include all predictors from the training set
+ data = <training dataset>)
<model> <- naiveBayes(<output variable> ~
+ <var 1> + <var 2> + ..., # include only selected predictors from the training set
+ data = <training dataset>)
print(<model>) # shows P(o/c) for factor vars, mean() and sd() for numeric vars
library(e1071)
# Naive Bayes model 1: Top.50.BB predicted from all other features;
# [, -1] drops the Title column (a unique identifier, useless as a predictor).
top.50.nb.1 <- naiveBayes(Top.50.BB ~ .,
data = train.data[, -1])
print(top.50.nb.1)
Naive Bayes Classifier for Discrete Predictors
Call:
naiveBayes.default(x = X, y = Y, laplace = laplace)
A-priori probabilities:
Y
No Yes
0.8393574 0.1606426
Conditional probabilities:
Year
Y 1958 1960 1961 1962 1963 1964 1965 1966
No 0.009569378 0.019138756 0.004784689 0.052631579 0.244019139 0.124401914 0.105263158 0.047846890
Yes 0.000000000 0.000000000 0.025000000 0.075000000 0.100000000 0.250000000 0.175000000 0.150000000
Year
Y 1967 1968 1969 1970 1977 1980
No 0.090909091 0.157894737 0.138755981 0.004784689 0.000000000 0.000000000
Yes 0.075000000 0.050000000 0.100000000 0.000000000 0.000000000 0.000000000
Duration
Y [22.5,119] (119,215] (215,310] (310,406] (406,502]
No 0.148325359 0.755980861 0.081339713 0.009569378 0.004784689
Yes 0.050000000 0.825000000 0.100000000 0.000000000 0.025000000
Other.releases
Y [-0.056,11.2] (11.2,22.4] (22.4,33.6] (33.6,44.8] (44.8,56.1]
No 0.717703349 0.229665072 0.043062201 0.009569378 0.000000000
Yes 0.075000000 0.325000000 0.375000000 0.200000000 0.025000000
Single.certification
Y No RIAA 2xPlatinum RIAA 4xPlatinum RIAA Gold RIAA Gold, BPI Silver RIAA Platinum
No 0.961722488 0.009569378 0.000000000 0.014354067 0.004784689 0.009569378
Yes 0.175000000 0.075000000 0.025000000 0.575000000 0.000000000 0.150000000
Cover
Y N Y
No 0.7416268 0.2583732
Yes 0.8750000 0.1250000
Covered.by
Y [-0.07,14] (14,28] (28,42] (42,56] (56,70.1]
No 0.904306220 0.062200957 0.028708134 0.004784689 0.000000000
Yes 0.500000000 0.150000000 0.225000000 0.100000000 0.025000000
Top.50.Rolling.Stone
Y [-0.05,10] (10,20] (20,30] (30,40] (40,50]
No 0.947368421 0.009569378 0.019138756 0.014354067 0.009569378
Yes 0.525000000 0.100000000 0.100000000 0.125000000 0.150000000
Top.50.NME
Y [-0.05,10] (10,20] (20,30] (30,40] (40,50]
No 0.933014354 0.028708134 0.009569378 0.009569378 0.019138756
Yes 0.575000000 0.075000000 0.150000000 0.125000000 0.075000000
Make predictions:
<predictions> <- predict(object = <NB model>,
+ newdata = <test dataset>,
+ type = "class")
<predictions>[<i1>:<ik>] # examine some of the predictions
<predictions dataframe> <-
+ data.frame(<observation ID> = <test dataset>$<observation ID column>,
+ <another relevant feature> = <test dataset>$<another relevant feature column>,
+ ...,
+ <predictions feature> = <predictions>)
# Predict class labels ("No"/"Yes") on the test set; Title (column 1) excluded again.
top.50.nb.predictions.1 <- predict(top.50.nb.1, newdata = test.data[, -1], type = "class")
top.50.nb.predictions.1[1:20] # spot-check the first 20 predictions
[1] No No No Yes No No No No No Yes No No No No No No No Yes No No
Levels: No Yes
# Side-by-side view: song title, true label, and model 1's prediction.
top.50.nb.predictions.1.dataframe <- data.frame(Song = test.data$Title,
Top.50.BB = test.data$Top.50.BB,
Prediction = top.50.nb.predictions.1)
Compute confusion matrix:
<cm> <- table(True = <test dataset>$<output variable>,
+ Predicted = <predictions>) # the predictions vector itself, not a column of the test dataset
# Confusion matrix for model 1: rows = true classes, columns = predicted classes.
nb.cm.1 <- table(True = test.data$Top.50.BB, Predicted = top.50.nb.predictions.1)
nb.cm.1
Predicted
True No Yes
No 46 6
Yes 4 5
Compute evaluation metrics:
Note: precision and recall are inversely proportional to each other.
<evaluation metrics vector> <- <user-specified function>(<cm>)
# accuracy = sum(diag(cm)) / sum(cm)
# precision <- TP / (TP + FP)
# recall <- TP / (TP + FN)
# F1 <- (2 * precision * recall) / (precision + recall)
source("Evaluation metrics.R") # defines getEvaluationMetrics()
# Accuracy / precision / recall / F1 for the all-predictors NB model.
eval.nb.1 <- getEvaluationMetrics(nb.cm.1)
eval.nb.1
Accuracy Precision Recall F1
0.8360656 0.4545455 0.5555556 0.5000000
Build another model, using only selected predictors, and compute confusion matrix and evaluation metrics:
library(e1071)
# Naive Bayes model 2: only three hand-picked predictors instead of all of them.
top.50.nb.2 <- naiveBayes(Top.50.BB ~ Year + Duration + Other.releases,
data = train.data[, -1])
print(top.50.nb.2)
Naive Bayes Classifier for Discrete Predictors
Call:
naiveBayes.default(x = X, y = Y, laplace = laplace)
A-priori probabilities:
Y
No Yes
0.8393574 0.1606426
Conditional probabilities:
Year
Y 1958 1960 1961 1962 1963 1964 1965 1966
No 0.009569378 0.019138756 0.004784689 0.052631579 0.244019139 0.124401914 0.105263158 0.047846890
Yes 0.000000000 0.000000000 0.025000000 0.075000000 0.100000000 0.250000000 0.175000000 0.150000000
Year
Y 1967 1968 1969 1970 1977 1980
No 0.090909091 0.157894737 0.138755981 0.004784689 0.000000000 0.000000000
Yes 0.075000000 0.050000000 0.100000000 0.000000000 0.000000000 0.000000000
Duration
Y [22.5,119] (119,215] (215,310] (310,406] (406,502]
No 0.148325359 0.755980861 0.081339713 0.009569378 0.004784689
Yes 0.050000000 0.825000000 0.100000000 0.000000000 0.025000000
Other.releases
Y [-0.056,11.2] (11.2,22.4] (22.4,33.6] (33.6,44.8] (44.8,56.1]
No 0.717703349 0.229665072 0.043062201 0.009569378 0.000000000
Yes 0.075000000 0.325000000 0.375000000 0.200000000 0.025000000
# Class predictions of the reduced (3-predictor) model on the test set.
top.50.nb.predictions.2 <- predict(top.50.nb.2, newdata = test.data[, -1], type = "class")
top.50.nb.predictions.2[1:20] # spot-check the first 20 predictions
[1] No No Yes No No No No No No No No No No No No No No No No No
Levels: No Yes
# Song / true label / prediction side-by-side for model 2.
top.50.nb.predictions.2.dataframe <- data.frame(Song = test.data$Title,
Top.50.BB = test.data$Top.50.BB,
Prediction = top.50.nb.predictions.2)
# Confusion matrix for model 2.
nb.cm.2 <- table(True = test.data$Top.50.BB, Predicted = top.50.nb.predictions.2)
nb.cm.2
Predicted
True No Yes
No 51 1
Yes 7 2
source("Evaluation metrics.R") # defines getEvaluationMetrics()
# Accuracy / precision / recall / F1 for the 3-predictor NB model.
eval.nb.2 <- getEvaluationMetrics(nb.cm.2)
eval.nb.2
Accuracy Precision Recall F1
0.8688525 0.6666667 0.2222222 0.3333333
Compare the evaluation metrics of the two classifiers:
data.frame(rbind(<evaluation metrics 1>, <evaluation metrics 2>, ...),
+ row.names = c("<model 1>", "<model 2>", ...))
# One row per model: all-predictors NB vs. 3-predictor NB.
data.frame(rbind(eval.nb.1, eval.nb.2),
row.names = c("eval.nb.1", "eval.nb.2"))
Important concepts:
ROC curve is the plot representing the function: TPR = f(FPR). It can be used to select a probability threshold for the classifier (it does not have to be 0.5). To do this, making predictions is done by computing probabilities for each class value (such as ‘Yes’ and ‘No’).
Build yet another model, to be used for plotting the ROC curve:
library(e1071)
?naiveBayes
<model> <- naiveBayes(<output variable> ~ ., # include all predictors from the training set
+ data = <training dataset>)
<model> <- naiveBayes(<output variable> ~
+ <var 1> + <var 2> + ..., # include only selected predictors from the training set
+ data = <training dataset>)
print(<model>) # shows P(o/c) for factor vars, mean() and sd() for numeric vars
library(e1071)
# Model 3 (for plotting the ROC curve): same formula as model 2; its
# predictions will be requested as probabilities rather than class labels.
top.50.nb.3 <- naiveBayes(Top.50.BB ~ Year + Duration + Other.releases, # can be the same as for top.50.nb.2
data = train.data[, -1])
Make predictions as probabilities:
<predictions> <- predict(object = <NB model>,
+ newdata = <test dataset>,
+ type = "raw") # the value "raw" means "compute probabilities, not classes"
<predictions>[<i1>:<ik>] # examine some of the predictions
<predictions dataframe> <-
+ data.frame(<observation ID> = <test dataset>$<observation ID column>,
+ <another relevant feature> = <test dataset>$<another relevant feature column>,
+ ...,
+ <predictions feature> = <predictions>)
# type = "raw": return per-class probabilities instead of hard class labels.
top.50.nb.predictions.3 <- predict(top.50.nb.3, newdata = test.data[, -1], type = "raw")
top.50.nb.predictions.3[1:20, ] # matrix: one row per test song, columns "No"/"Yes"
No Yes
[1,] 0.9649849 0.035015096
[2,] 0.9771835 0.022816472
[3,] 0.2784095 0.721590502
[4,] 0.6273678 0.372632229
[5,] 0.9845139 0.015486143
[6,] 0.8919640 0.108036045
[7,] 0.9951647 0.004835349
[8,] 0.9698362 0.030163823
[9,] 0.8919640 0.108036045
[10,] 0.8064744 0.193525575
[11,] 0.9931359 0.006864109
[12,] 0.8064744 0.193525575
[13,] 0.6705253 0.329474707
[14,] 0.9579813 0.042018747
[15,] 0.9931359 0.006864109
[16,] 0.6273678 0.372632229
[17,] 0.9845139 0.015486143
[18,] 0.9359577 0.064042299
[19,] 0.9760018 0.023998163
[20,] 0.9931359 0.006864109
# Song / true label / P(No) / P(Yes) side-by-side for model 3.
top.50.nb.predictions.3.dataframe <- data.frame(Song = test.data$Title,
Top.50.BB = test.data$Top.50.BB,
Prediction.probability.No = top.50.nb.predictions.3[, 1],
Prediction.probability.Yes = top.50.nb.predictions.3[, 2])
Compute ROC curve parameters, the area under the curve (AUC), and plot the curve:
library(pROC)
<ROC curve parameters> <- # compute ROC curve parameters
+ roc(response = <test dataset>$<output variable>,
+ predictor = <predicted probabilities>[, <1 | 2>]) # col. no. of the "positive class" (can be the No class!)
<ROC curve parameters>$auc # extract and show AUC
library(pROC)
# Build the ROC curve from the true labels and the predicted P(Yes) (column 2
# of the probability matrix), then report the area under the curve (AUC).
top.50.nb.predictions.3.roc <-
roc(response = test.data$Top.50.BB,
predictor = top.50.nb.predictions.3[, 2])
top.50.nb.predictions.3.roc$auc
Area under the curve: 0.7618
Plot the ROC curve:
plot.roc(<ROC curve parameters>, # computed in the previous step
+ print.thres = TRUE, # show the probability threshold (cut-off point) on the plot
+ print.thres.best.method =
+ "youden" | # maximize the sum of sensitivity and specificity (the distance to the diag. line)
+ "closest.topleft") # minimize the distance to the top-left point of the plot
# Plot the ROC curve; "youden" picks and prints the probability threshold
# that maximizes sensitivity + specificity.
plot.roc(top.50.nb.predictions.3.roc,
print.thres = TRUE,
print.thres.best.method = "youden")
Getting the probability threshold and other ROC curve parameters using pROC::coords():
<ROC coords> <- coords(<ROC curve parameters>, # computed in the previous step
+ ret = c("accuracy", "spec", "sens", "thr", ...), # ROC curve parameters to return
+ x = # the coordinates to look for:
+ "local maximas" | # local maximas of the ROC curve
+ "best" | ...) # the point with the best sum of sensitivity and specificity,
+ # i.e. the same as the one shown on the ROC curve plot
# Extract accuracy / specificity / sensitivity / threshold at every local
# maximum of the ROC curve (the candidate probability cut-off points).
top.50.nb.predictions.3.coords <-
coords(top.50.nb.predictions.3.roc,
ret = c("accuracy", "spec", "sens", "thr"),
x = "local maximas")
top.50.nb.predictions.3.coords
local maximas local maximas local maximas local maximas local maximas local maximas local maximas
accuracy 0.52459016 0.68852459 0.7377049 0.7540984 0.7540984 0.8524590 0.8688525
specificity 0.44230769 0.67307692 0.7500000 0.7884615 0.8076923 0.9423077 0.9807692
sensitivity 1.00000000 0.77777778 0.6666667 0.5555556 0.4444444 0.3333333 0.2222222
threshold 0.02340732 0.05303052 0.1017091 0.1418192 0.1903378 0.3474709 0.4897984
local maximas
accuracy 0.852459
specificity 1.000000
sensitivity 0.000000
threshold Inf
10-fold cross-validation: https://www.openml.org/a/estimation-procedures/1